import numpy as np import pandas as pd import matplotlib.pyplot as plt import pandas as pd import seaborn as sns import warnings import plotly.express as px import plotly.graph_objects as go from scipy.stats import gaussian_kde import plotly.figure_factory as ff import plotly.io as pio from plotly.subplots import make_subplots color = "seaborn" import ipywidgets as widgets from scipy import stats pio.templates.default = "plotly_white" import plotly.io as pio import plotly.figure_factory as ff from sklearn.model_selection import train_test_split, cross_val_score from xgboost import XGBRegressor pio.renderers.default='notebook'
# df = pd.read_csv('./data.csv',index_col = 0)
df = pd.read_pickle('./data/df_2017-5k.pkl')
# # df = pd.read_csv('./data.csv',index_col = 0)
# df = pd.read_pickle('./df_marathon.pkl')
# df.head(1)
df['finish_time'] = df['time_100'].map(lambda x: x[-1])
## Checking after finish timed added
## display the dataset
df
| hashedid | hashedathleteid | startdate | startdatelocal | totaldistance | elevgain | startlatapprox | startlngapprox | cumulative_elevation_gain | cumulative_elevation_loss | ... | geog | distance_100 | time_100 | elevation_100 | cadence_100 | heartrate_100 | pace_diff_100 | pace_cumul_100 | gap_100 | finish_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9a344e1b10a653c2cee3d40cabd615d46d2604198b15bb... | 197b3e2a67167805191c315e64640e5b3aa0dae4a7cba3... | 2017-10-29 09:10:29 | 2017-10-29 09:10:29 | 42311.3 | 271.600 | 53.33 | -6.25 | 235.0 | 187.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 38.98845, 75.44795, 112.36312, 143.05545... | [18.3, 17.900051, 17.956602, 17.997967, 17.303... | [] | [124.0, 138.00153, 158.49796, 172.15256, 183.9... | [1e-05, 6.682236, 6.02834, 5.416158, 4.943549,... | [25.0, 6.498075, 6.287327, 6.242397, 5.960643,... | [0.0, 6.828764, 6.009906, 5.404039, 5.133536, ... | 13093.2630 |
| 1 | 9eab2135ab8c8182d2e6eb36c87606efa3418d31a7b152... | 85ab08865f60f548c2a362aeadf7d2fac801b0b75d0c21... | 2017-10-29 09:10:39 | 2017-10-29 09:10:39 | 42461.3 | 269.913 | 53.33 | -6.25 | 221.0 | 202.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 38.141705, 75.420105, 112.58787, 140.797... | [18.0, 18.1182, 18.568665, 18.237965, 17.30003... | [1e-05, 60.776775, 78.99788, 78.99595, 79.3095... | [69.0, 118.00433, 115.79638, 117.72328, 118.03... | [1e-05, 6.744568, 5.696991, 4.160349, 4.65892,... | [25.0, 6.357691, 6.28458, 6.255208, 5.866589, ... | [0.0, 6.701542, 5.559884, 4.235624, 4.902388, ... | 14995.8400 |
| 2 | 9ec542cb1a44509fd87521c5743d400987df1433b48544... | ef33b1f6295728ab85be787754de13ef0f22d85546dc95... | 2017-10-29 09:02:03 | 2017-10-29 09:02:03 | 42396.6 | 263.140 | 53.33 | -6.25 | 219.0 | 203.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 34.051785, 63.714027, 99.33842, 126.0090... | [17.8, 17.979973, 18.573772, 18.525688, 18.340... | [84.0, 86.831635, 87.57081, 85.98568, 88.64971... | [] | [1e-05, 5.229298, 5.400293, 5.635104, 4.725854... | [25.0, 5.686955, 5.31107, 5.518765, 5.250323, ... | [0.0, 5.178599, 5.229725, 5.649802, 4.773427, ... | 13152.5990 |
| 3 | 9ec8e3139fcd0b60af668440d0560c30d4eec6faf9ede5... | ffa2c7a60a904ff047b58e43bb4a1c17e55fbbcf173127... | 2017-10-29 08:55:33 | 2017-10-29 08:55:33 | 42621.4 | 282.699 | 53.34 | -6.25 | 213.0 | 212.0 | ... | 0101000020E610000000000000000019C00000002085AB... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 363.61026, 395.8846, 423.8148, 459.01483... | [17.1, 18.10048, 18.188477, 18.501003, 18.2008... | [] | [] | [1e-05, 4.854006, 5.094511, 4.652123, 4.432908... | [25.0, 25.0, 25.0, 23.5453, 19.125616, 16.3804... | [0.0, 4.598914, 5.070293, 4.574117, 4.505633, ... | 12864.3190 |
| 4 | 9f0e9a093a823fcf03a22948c4e417c4de0a4bf126b865... | 3168b39d03feaa62356a49abbadf57ccec493d7a25de23... | 2017-10-29 09:00:25 | 2017-10-29 09:00:25 | 42489.4 | 195.000 | 53.33 | -6.25 | 217.0 | 206.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 33.269478, 64.696045, 98.46664, 124.9062... | [-132.2, -131.97264, -131.61813, -131.48433, -... | [60.0, 92.47492, 94.040146, 94.219345, 96.0676... | [108.0, 129.89734, 134.17978, 135.58492, 141.1... | [1e-05, 5.756402, 4.844161, 4.206395, 4.504928... | [25.0, 5.546336, 5.391422, 5.470671, 5.204367,... | [0.0, 5.685999, 4.752145, 4.17603, 4.401713, 4... | 11219.2430 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3161 | 9bf625b142d3122895b9883cead4af52d44e2173dfc7b5... | 92e8d744e080ee1f1f1c37894af12fdbc264b4335dc4eb... | 2017-10-29 09:01:53 | 2017-10-29 09:01:53 | 42501.1 | 265.380 | 53.33 | -6.25 | 209.0 | 214.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 30.884827, 61.23929, 93.71124, 123.22373... | [17.1, 18.095196, 18.597677, 18.302849, 18.451... | [1e-05, 79.952736, 85.98579, 84.81591, 87.0576... | [] | [1e-05, 5.281608, 4.70696, 5.168903, 4.654006,... | [25.0, 5.143316, 5.10306, 5.206595, 5.134331, ... | [0.0, 5.005465, 4.5808, 5.252191, 4.616716, 4.... | 12758.8955 |
| 3162 | 9c22d775800b2d1e4614329c62d3354960acf10466d603... | 59e8991532737aded81123b9162b968f44237d4c58aebb... | 2017-10-29 08:34:00 | 2017-10-29 08:34:00 | 22082.4 | 107.000 | 53.34 | -6.26 | 110.0 | 122.0 | ... | 0101000020E6100000000000803D0A19C00000002085AB... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 24.843252, 57.57859, 85.12957, 105.17841... | [99.2, 99.011215, 98.36667, 98.54853, 99.62743... | [59.0, 86.02894, 86.65374, 88.05222, 86.984566... | [] | [1e-05, 3.631549, 4.744741, 4.87496, 0.556868,... | [25.0, 4.142016, 4.796249, 4.729414, 4.382618,... | [0.0, 3.668899, 4.913648, 4.827204, 0.525386, ... | 12429.2870 |
| 3163 | 9c33c97ef0b99bd4f42cca563b8b73d16d1caf82569892... | c6325a263405cbe4861c5354921329594861aca902faa2... | 2017-10-29 08:11:24 | 2017-10-29 08:11:24 | 42655.7 | 211.800 | 53.33 | -6.25 | 192.0 | 189.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 38.05753, 76.406944, 119.077934, 154.057... | [56.8, 55.800014, 56.8, 56.8, 56.8, 56.8, 56.7... | [1e-05, 84.0, 83.10598, 83.0, 83.0, 82.99874, ... | [116.0, 128.04362, 129.0, 128.03062, 133.00156... | [1e-05, 5.612709, 6.130106, 6.102883, 5.490096... | [25.0, 6.342913, 6.367242, 6.615442, 6.419069,... | [0.0, 5.926004, 5.808106, 6.102883, 5.490096, ... | 15421.2430 |
| 3164 | 9c4f3f7c353748094548e6670cdc7028ca484d733f31b7... | c6fae963e810e1a1c11c771e2a77d6c88a6a5d41a3eb27... | 2017-10-29 09:31:39 | 2017-10-29 09:31:39 | 44568.3 | 288.207 | 53.34 | -6.25 | 240.0 | 205.0 | ... | 0101000020E610000000000000000019C00000002085AB... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 50.392284, 97.46559, 146.05293, 194.7328... | [17.7, 18.191647, 18.70834, 18.121914, 18.6573... | [] | [] | [1e-05, 7.141158, 9.09016, 19.81336, 8.482373,... | [25.0, 8.399151, 8.121722, 8.113903, 8.113851,... | [0.0, 6.95382, 8.839737, 20.45398, 8.240371, 2... | 24714.9730 |
| 3165 | 9c6d3c0560d73025be19d0f945e0a76714c11e0de580a8... | bf27b30646e3a79875c16c0d172bcdb272ce3cd4168a00... | 2017-10-29 09:01:40 | 2017-10-29 09:01:40 | 42488.3 | 186.000 | 53.33 | -6.25 | 218.0 | 206.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 29.10826, 60.614273, 102.052956, 131.584... | [5.6, 5.875436, 6.228689, 6.326708, 5.797177, ... | [1e-05, 89.92593, 90.01278, 85.70384, 87.6375,... | [107.0, 136.8759, 143.56119, 140.79216, 141.82... | [1e-05, 4.97735, 5.525431, 5.20914, 4.94926, 6... | [25.0, 4.852088, 5.051579, 5.669946, 5.482744,... | [0.0, 4.903712, 5.420839, 5.181566, 5.093511, ... | 12931.0580 |
3166 rows × 25 columns
## display full dataset with added column finish time
obs = 100#df.shape[0]
pd.set_option('display.max_rows', obs)
athlete_wid = widgets.Dropdown(
options=df.hashedid.unique(),
description='Athelte Ids:',
disabled=False,
)
display(athlete_wid)
Dropdown(description='Athelte Ids:', options=('9a344e1b10a653c2cee3d40cabd615d46d2604198b15bb62069effaee37ba27…
athlete_id = athlete_wid.value
row = df[df.hashedid==athlete_id].copy()
row
| hashedid | hashedathleteid | startdate | startdatelocal | totaldistance | elevgain | startlatapprox | startlngapprox | cumulative_elevation_gain | cumulative_elevation_loss | ... | geog | distance_100 | time_100 | elevation_100 | cadence_100 | heartrate_100 | pace_diff_100 | pace_cumul_100 | gap_100 | finish_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9a344e1b10a653c2cee3d40cabd615d46d2604198b15bb... | 197b3e2a67167805191c315e64640e5b3aa0dae4a7cba3... | 2017-10-29 09:10:29 | 2017-10-29 09:10:29 | 42311.3 | 271.6 | 53.33 | -6.25 | 235.0 | 187.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 38.98845, 75.44795, 112.36312, 143.05545... | [18.3, 17.900051, 17.956602, 17.997967, 17.303... | [] | [124.0, 138.00153, 158.49796, 172.15256, 183.9... | [1e-05, 6.682236, 6.02834, 5.416158, 4.943549,... | [25.0, 6.498075, 6.287327, 6.242397, 5.960643,... | [0.0, 6.828764, 6.009906, 5.404039, 5.133536, ... | 13093.263 |
1 rows × 25 columns
Following plot shows the counts of different heart rate values as well as Kernel Density Estimate curve
group_labels = ['distplot']
fig = ff.create_distplot(row.heartrate_100.tolist(),
group_labels,
show_hist=False)\
.add_traces(px.histogram(row.heartrate_100.item(),pattern_shape_sequence=[ "-"])
.update_traces(yaxis="y3", name="histogram")
.data)
fig.update_layout(yaxis3={"overlaying": "y", "side": "right"},
bargap=0.2, showlegend=False,
xaxis_title="Heart Rate",
title = "Heart Rate and KDE for an Individual")
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()
Following plot shows distance on x-axis and heart rate and pace diff series on Y-axis. You can select either one of the series from Plotly pannel on top right to interact with just one series
### Pace Vs hearrate
row
| hashedid | hashedathleteid | startdate | startdatelocal | totaldistance | elevgain | startlatapprox | startlngapprox | cumulative_elevation_gain | cumulative_elevation_loss | ... | geog | distance_100 | time_100 | elevation_100 | cadence_100 | heartrate_100 | pace_diff_100 | pace_cumul_100 | gap_100 | finish_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9a344e1b10a653c2cee3d40cabd615d46d2604198b15bb... | 197b3e2a67167805191c315e64640e5b3aa0dae4a7cba3... | 2017-10-29 09:10:29 | 2017-10-29 09:10:29 | 42311.3 | 271.6 | 53.33 | -6.25 | 235.0 | 187.0 | ... | 0101000020E610000000000000000019C0000000803DAA... | [0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0... | [0.0, 38.98845, 75.44795, 112.36312, 143.05545... | [18.3, 17.900051, 17.956602, 17.997967, 17.303... | [] | [124.0, 138.00153, 158.49796, 172.15256, 183.9... | [1e-05, 6.682236, 6.02834, 5.416158, 4.943549,... | [25.0, 6.498075, 6.287327, 6.242397, 5.960643,... | [0.0, 6.828764, 6.009906, 5.404039, 5.133536, ... | 13093.263 |
1 rows × 25 columns
def plot_col_hr(row,x_col,y1_col,y2_col,if_col2 = True):
col,y1col,y2col = x_col.split("_")[0],y1_col.split("_")[0],y2_col.split("_")[0]
fig = go.Figure()
fig.add_trace(go.Scatter(x=row[x_col].item(), y=row[y1_col].item(),
mode='lines+markers',
name=f'{col} vs {y1col}'))
title = f"{col} vs {y1col}"
if if_col2:
fig.add_trace(go.Scatter(x=row[x_col].item(), y=row[y2_col].item(),
mode='lines+markers',
name=f'{col} vs {y2col}'))
title = f"{col} vs {y1col} and {y2col}"
fig.update_layout( xaxis_title=f"{col}",
yaxis_title=f"{y1col}",title = title,
dragmode="zoom",
hovermode="x",
legend=dict(traceorder="reversed"),)
fig.show()
x_col = 'distance_100'
y1_col = 'heartrate_100'
y2_col = ''
plot_col_hr(row.copy(),x_col,y1_col,y2_col,False)
Group by pace difference values, taking average heart rate values for each pace value.
x_col = 'pace_diff_100'
y1_col = 'heartrate_100'
y2_col = ''
mean_row_df = row[[x_col,y1_col]].copy()#.groupby(x_col).mean().reset_index()
# plot_col_hr(row.copy(),x_col,y1_col,y2_col,False)
# mean_row_df
cols = [x_col,y1_col]
mean_row_df = row.explode(cols)[cols].copy().groupby(x_col).mean().reset_index()
mean_row_df = mean_row_df.sort_values(by = x_col)
fig = go.Figure()
fig.add_trace(go.Scatter(x=mean_row_df[x_col], y=mean_row_df[y1_col],
mode='lines+markers',
name=f'Pace Difference vs Heart Rate'))
title = f"Pace Difference vs Heart Rate"
# fig.update_xaxes(showgrid=False)
# fig.update_yaxes(showgrid=False)
fig.update_layout( xaxis_title='Pace Difference',
yaxis_title="Heart Rate",title = title,
dragmode="zoom",
hovermode="x",
legend=dict(traceorder="reversed"))
fig.show()
- From the above plot it is to be understood that the ralation between pace and heart rate inverse ralation
x_col = 'time_100'
y1_col = 'heartrate_100'
y2_col = ''
plot_col_hr(row,x_col,y1_col,y2_col,False)
x_col = 'time_100'
y1_col = 'pace_diff_100'
y2_col = ''
plot_col_hr(row,x_col,y1_col,y2_col,False)
we calculate the cardiac cost using heart rate and pace diff. Following plot shows distance on x-axis and heartrate and cardiac cost on y-axis
pdiff = np.array(row['pace_diff_100'].item())
pdiff[pdiff<1] = pdiff.mean()
row['pace_diff_100'] = [pdiff]
row['cardiac_cost_100'] = None
row['cardiac_cost_100'] = [np.array(row.heartrate_100.item())/np.array(row.pace_diff_100.item())/6000.]
x_col = 'distance_100'
y1_col = 'cardiac_cost_100'
y2_col = ''
plot_col_hr(row.copy(),x_col,y1_col,y2_col,False)
x_col = 'heartrate_100'
y1_col = 'cardiac_cost_100'
y2_col = ''
mean_row_df = row[[x_col,y1_col]].copy()#.groupby(x_col).mean().reset_index()
# plot_col_hr(row.copy(),x_col,y1_col,y2_col,False)
# mean_row_df
cols = [x_col,y1_col]
mean_row_df = row.explode(cols)[cols].copy().groupby(x_col).mean().reset_index()
mean_row_df = mean_row_df[(np.abs(stats.zscore(mean_row_df[x_col])) < 1)]
mean_row_df = mean_row_df.sort_values(by = x_col)
fig = go.Figure()
fig.add_trace(go.Scatter(x=mean_row_df[x_col], y=mean_row_df[y1_col],
mode='lines+markers',
name=f'Cardiac Cost vs Heart Rate'))
title = f"Cardiac Cost vs Heart Rate"
# fig.update_xaxes(showgrid=False)
# fig.update_yaxes(showgrid=False)
fig.update_layout( xaxis_title="Heart Rate",
yaxis_title="Cardiac Cost",title = title,
dragmode="zoom",
hovermode="x",
legend=dict(traceorder="reversed"),)
fig.show()
Below plot shows pace difference on x axis and cardiac cost on y axis to identify possible trends
x_col = 'pace_diff_100'
y1_col = 'cardiac_cost_100'
y2_col = ''
mean_row_df = row[[x_col,y1_col]].copy()#.groupby(x_col).mean().reset_index()
# plot_col_hr(row.copy(),x_col,y1_col,y2_col,False)
# mean_row_df
cols = [x_col,y1_col]
mean_row_df = row.explode(cols)[cols].copy().groupby(x_col).mean().reset_index()
mean_row_df = mean_row_df[(np.abs(stats.zscore(mean_row_df[x_col])) < 2)]
mean_row_df = mean_row_df.sort_values(by = x_col)
fig = go.Figure()
fig.add_trace(go.Scatter(x=mean_row_df[x_col], y=mean_row_df[y1_col],
mode='lines+markers',
name=f'Pace Difference vs Cardiac Cost'))
title = f"Pace Difference vs Cardiac Cost"
# fig.update_xaxes(showgrid=False)
# fig.update_yaxes(showgrid=False)
fig.update_layout( xaxis_title="Pace Difference",
yaxis_title="Cardiac Cost",title = title,
dragmode="zoom",
hovermode="x",
legend=dict(traceorder="reversed"),)
fig.show()
x_col = 'cardiac_cost_100'
y1_col = 'heartrate_100'
y2_col = 'pace_diff_100'
mean_row_df = row[[x_col,y1_col,y2_col]].copy()#.groupby(x_col).mean().reset_index()
# plot_col_hr(row.copy(),x_col,y1_col,y2_col,False)
# mean_row_df
cols = [x_col,y1_col,y2_col]
mean_row_df = row.explode(cols)[cols].copy().groupby(x_col).mean().reset_index()
mean_row_df = mean_row_df[(np.abs(stats.zscore(mean_row_df[x_col])) < 2)]
mean_row_df = mean_row_df.sort_values(by = x_col)
fig = go.Figure()
fig.add_trace(go.Scatter(x=mean_row_df[x_col], y=mean_row_df[y1_col],
mode='lines+markers',
name=f'cc vs heart rate'))
fig.add_trace(go.Scatter(x=mean_row_df[x_col], y=mean_row_df[y2_col],
mode='lines+markers',
name=f'cc vs pace diff'))
title = f"Cardiac Cost vs Heart rate and Pace"
# fig.update_xaxes(showgrid=False)
# fig.update_yaxes(showgrid=False)
fig.update_layout( xaxis_title=x_col,
yaxis_title="pace | heart rate",title = title,
dragmode="zoom",
hovermode="x",
legend=dict(traceorder="reversed"),)
fig.show()
Following plot shows the heart rate variability with respect to distance
x_col = 'distance_100'
y1_col = 'heartrate_100'
y2_col = ""
plot_col_hr(row.copy(),x_col,y1_col,y2_col,if_col2=False)
The code provided generates a line plot depicting the relationship between Cardiac Cost and Heart Rate. The plot showcases how the two variables vary in relation to each other. Again we can see there is a lot of jittering and no clear trend
x_col = 'cardiac_cost_100'
y1_col = 'heartrate_100'
y2_col = ''
mean_row_df = row[[x_col,y1_col]].copy()#.groupby(x_col).mean().reset_index()
# plot_col_hr(row.copy(),x_col,y1_col,y2_col,False)
# mean_row_df
cols = [x_col,y1_col]
mean_row_df = row.explode(cols)[cols].copy().groupby(x_col).mean().reset_index()
mean_row_df = mean_row_df[(np.abs(stats.zscore(mean_row_df[x_col])) < 1)]
mean_row_df = mean_row_df.sort_values(by = x_col)
fig = go.Figure()
fig.add_trace(go.Scatter(x=mean_row_df[x_col], y=mean_row_df[y1_col],
mode='lines+markers',
name=f'Cardiac Cost vs Heart Rate'))
title = f"Cardiac Cost vs Heart Rate"
# fig.update_xaxes(showgrid=False)
# fig.update_yaxes(showgrid=False)
fig.update_layout( xaxis_title="Cardiac Cost",
yaxis_title="Heart Rate",title = title,
dragmode="zoom",
hovermode="x",
legend=dict(traceorder="reversed"),)
fig.show()
The provided code segment detects peaks in the heart rate data and generates a plot showcasing the heart rates with detected peaks and the similarity of the heart rate with a QRS filter.
The detect_peaks() function takes heart rate data as input and uses a QRS filter to identify peaks. The function normalizes the data and calculates the correlation similarity with the QRS filter. Peaks above a certain threshold are considered, and their indices are returned along with the similarity values. The plot_hrp() function takes a row of data and extracts the heart rate and time values. It then calls the detect_peaks() function to obtain the peaks and similarity. The function plots the heart rates with detected peaks and the similarity with the QRS filter in separate subplots.
The resulting plot provides insights into the occurrence of peaks in the heart rate data and the similarity of the heart rate pattern with the QRS filter, which can be used for further analysis or interpretation.
#detecting peaks
def detect_peaks(heartrate, threshold=0.3, qrs_filter=None):
if qrs_filter is None:
# create default qrs filter, which is just a part of the sine function
t = np.linspace(1.5 * np.pi, 3.5 * np.pi, 15)
qrs_filter = np.sin(t)
heartrate = np.array(heartrate)
# normalize data
heartrate = (heartrate - heartrate.mean()) / heartrate.std()
similarity = np.correlate(heartrate, qrs_filter, mode="same")
similarity = similarity / np.max(similarity)
heartrate = heartrate.tolist()
peaks=[]
for i in range(len(heartrate)):
if heartrate[i]>threshold:
peaks.append(i)
return peaks, similarity
def plot_hrp(row):
heartrate_100 = row.heartrate_100
time_100 = row.time_100
l=[]
t=[]
l=heartrate_100
t=time_100
peaks, similarity = detect_peaks(l, threshold=0.3)
# Extract Peak points in heart rate
# plot 3 way
a=[]
c=[]
for i in range(len(peaks)):
b=peaks[i]
a.append(l[b])
for i in range(len(peaks)):
b=peaks[i]
c.append(t[b])
plt.figure(figsize=(20, 15))
plt.subplot(211)
plt.title("Heart rates with peaks")
plt.plot(c, a, label="Heartrate", color="#51A6D8",marker="o", linewidth=1)
plt.legend(loc="upper right")
plt.xlabel("Time ")
plt.ylabel("Heartrate")
plt.subplot(212)
plt.title('Similarity with QRS ')
plt.plot(t, similarity, label="Similarity with QRS ", color="olive", linewidth=1)
plt.legend(loc="upper right")
plt.xlabel("Time ")
plt.ylabel("Similarity (normalized)")
plot_hrp(df.iloc[0])
we are calculating the Cardiac cost for all individuals using the apply method.
# df[['heartrate_100','pace_diff_100']].head()
df = df[(df.heartrate_100.str.len()>0)&
(df.pace_diff_100.str.len()!=0)].copy()
df['cardiac_cost_100'] = df.apply(lambda row: np.array(row.heartrate_100)\
/np.array(row.pace_diff_100)/6000.
,axis=1)
df['avg_heart_rate'] = df.heartrate_100.map(lambda x : np.array(x).mean())
The provided code segment calculates heart rate zones based on certain criteria. Here's a breakdown of the steps involved:
Dropping Noisy Heart Rate Values: The code filters out heart rate values that fall below the minimum possible heart rate (MIN_HR_POSSIBLE) or exceed the maximum possible heart rate (MAX_HR_POSSIBLE). This is done to remove noisy or unrealistic heart rate data.
Computing the Maximum and Minimum Heart Rate: The maximum and minimum heart rates are computed based on the filtered heart rate values. These values will be used to define the heart rate zones.
Dividing into Equal Intervals: The heart rate range between the maximum and minimum values is divided into equal intervals of 5. These intervals will serve as the boundaries for the heart rate zones.
Computing the Zones of Different Intervals: The code computes the zones of different intervals based on the calculated zone limits. Each zone is represented by a minimum and maximum heart rate value. The resulting zones are stored in a DataFrame called 'zones_df'.
The 'zones_df' DataFrame contains the following columns:
## Calculating Heart Rate Zones
MIN_HR_POSSIBLE = 30
MAX_HR_POSSIBLE = 200
distance_100 = row.distance_100
# heart_rates = np.array([y for x in df.heartrate_100.values for y in x])
# heart_rates = heart_rates[(heart_rates>MIN_HR_POSSIBLE)&(heart_rates<MAX_HR_POSSIBLE)]
max_hr, min_hr = min(df.avg_heart_rate.max(),MAX_HR_POSSIBLE),max(df.avg_heart_rate.min(),MIN_HR_POSSIBLE)
limits = np.arange(0.5,1.01,0.1)
zone_limits = max_hr*limits
idxs = np.arange(0,2).reshape(1,-1) + np.arange(0,5).reshape(-1,1)
zone_limits = zone_limits[idxs]
zones_df = pd.DataFrame(zone_limits,columns = ['min_h','max_h'])
zones_df['height'] =zones_df['max_h'] - zones_df['min_h']
zones_df['zone'] = zones_df.index + 1
zones_df['zone'] = 'zone '+ zones_df.zone.astype('str')
zones_df
| min_h | max_h | height | zone | |
|---|---|---|---|---|
| 0 | 100.0 | 120.0 | 20.0 | zone 1 |
| 1 | 120.0 | 140.0 | 20.0 | zone 2 |
| 2 | 140.0 | 160.0 | 20.0 | zone 3 |
| 3 | 160.0 | 180.0 | 20.0 | zone 4 |
| 4 | 180.0 | 200.0 | 20.0 | zone 5 |
It takes the 'zones_df' DataFrame as input and specifies the following parameters:
x="min_h": The minimum heart rate values are plotted on the x-axis.y="max_h": The maximum heart rate values are plotted on the y-axis.size="height": The size of the markers is determined by the height of the heart rate zones, representing the difference between the maximum and minimum heart rate values.color="zone": Each zone is assigned a different color, which is reflected in the plot.log_x=True: The x-axis is plotted on a logarithmic scale, which can be useful when dealing with a wide range of heart rate values.size_max=60: Sets the maximum size of the markers.The resulting scatter plot visualizes the heart rate zones, with each zone represented by a marker. The position of the marker indicates the range of heart rates for that particular zone, while the size of the marker represents the height or width of the zone.
fig = px.scatter(zones_df, x="min_h", y="max_h",
size="height", color="zone"
, log_x=True, size_max=60)
fig.show()
import matplotlib.pyplot as plt
import seaborn as sns
def plot_hrz(row):
distance_100 = row.distance_100
heartrate_100 = row.heartrate_100
maxh = max(heartrate_100)
limit1 = maxh*0.50 # 50 percent of Maxh
limit2 = maxh*0.60 # 60 percent of Maxh
limit3 = maxh*0.70 # 70 percent of Maxh
limit4 = maxh*0.80 # 80 percent of Maxh
limit5 = maxh*0.90 # 90 percent of Maxh
zones = ["Zone 1\n({:.0f}-{:.0f})".format(limit1, limit2),
"Zone 2\n({:.0f}-{:.0f})".format(limit2, limit3),
"Zone 3\n({:.0f}-{:.0f})".format(limit3, limit4),
"Zone 4\n({:.0f}-{:.0f})".format(limit4, limit5),
"Zone 5\n({:.0f}-{:.0f})".format(limit5, maxh)]
values = [sum(1 for hr in heartrate_100 if hr >= limit1 and hr < limit2),
sum(1 for hr in heartrate_100 if hr >= limit2 and hr < limit3),
sum(1 for hr in heartrate_100 if hr >= limit3 and hr < limit4),
sum(1 for hr in heartrate_100 if hr >= limit4 and hr < limit5),
sum(1 for hr in heartrate_100 if hr >= limit5)]
plt.bar(zones, values)
plt.title("Heart Rate Zones for Athlete")
plt.xlabel("Heart Rate Zones")
plt.ylabel("Number of performer spend")
plt.show()
plot_hrz(df.iloc[0])
Below section calculates and visualizes zone-wise heart rate, cardiac cost, pace difference, and time for a performer. Here's a breakdown of the steps involved:
get_zone() function: This function takes a heart rate value as input and determines the corresponding heart rate zone based on the 'zones_df' DataFrame. It returns the zone if a match is found, or None otherwise.
plot_groups() function: This function plots the heart rate, cardiac cost, pace difference, and time for each zone. It takes two arguments: 'group' represents the performer group, and 'df_group' contains the data for that specific group.
## Zone wise heart rate for performer
### Zone Wise Heart Rate , Cardiac cost and Pace
def get_zone(hrate):
_zone_df = zones_df[(zones_df.max_h >= hrate) & (zones_df.min_h <= hrate)]
if not _zone_df.empty:
return _zone_df['zone'].iloc[-1]
return None
def plot_groups(group,df_group):
# create subplots with 1 row and 3 columns
zones = zones_df.zone.unique()
for zone in zones:
fig = make_subplots(rows=1, cols=4)
df_zone_group = df_group[df_group['zone']==zone]
if df_zone_group.empty or df_zone_group.shape[0]<2: #atleast 2 datapoints
continue
# loop over the three columns in df
for i, col in enumerate(['avg_cardiac_cost', 'avg_heart_rate', 'avg_time','avg_pace_diff']):
# add a boxplot for the current column to the subplot
fig.add_trace(px.box(df_zone_group[col], points="all", boxmode="overlay").data[0], row=1, col=i+1)
fig.update_xaxes(showgrid=False, row=1, col=i+1)
fig.update_yaxes(showgrid=False, row=1, col=i+1)
fig.update_layout(title = f"{zone} : Plotting Heart Rate, Pace Diff, Time and Cardiac cost for {group} Group")
fig.show()
df['zone'] = df.avg_heart_rate.map(lambda x:get_zone(x))
df['avg_cardiac_cost'] = df.cardiac_cost_100.map(lambda x : np.array(x).mean())
df['avg_pace_diff'] = df.pace_diff_100.map(lambda x : np.array(x).mean())
Below box plots show the Heart rate, cardiac cost, time and pace difference distribution for different individuals for zone 1. We can see that not many individuals heart rate belong to this zone
df['avg_time'] = df.time_100.apply(lambda x: np.array(x).mean())
def plot_zones(group,df_group):
# create subplots with 1 row and 3 columns
fig = make_subplots(rows=1, cols=3)
# loop over the three columns in df
for i, col in enumerate(['avg_cardiac_cost', 'avg_heart_rate','avg_pace_diff']):
# add a boxplot for the current column to the subplot
fig.add_trace(px.box(df_zone_group[col], points="all", boxmode="overlay").data[0], row=1, col=i+1)
fig.update_xaxes(showgrid=False, row=1, col=i+1)
fig.update_yaxes(showgrid=False, row=1, col=i+1)
fig.update_layout(title = f"{zone} : Plotting Heart Rate, Pace Diff and Cardiac cost")
fig.show()
zone = 'zone 1'
df_group = df[df.zone==zone].copy()
plot_groups(zone,df_group)
A lot of individuals heart rates are in this zones as compared to zone 1. Also There is a Clear difference between the distributions
zone = 'zone 2'
df_group = df[df.zone==zone].copy()
plot_groups(zone,df_group)
def plot_groups(group,df_group):
# create subplots with 1 row and 3 columns
zones = zones_df.zone.unique()
for zone in zones:
fig = make_subplots(rows=1, cols=4)
df_zone_group = df_group[df_group['zone']==zone]
if df_zone_group.empty or df_zone_group.shape[0]<2: #atleast 2 datapoints
continue
# loop over the three columns in df
for i, col in enumerate(['avg_cardiac_cost', 'avg_heart_rate', 'avg_time','avg_pace_diff']):
# add a boxplot for the current column to the subplot
fig.add_trace(px.box(df_zone_group[col], points="all", boxmode="overlay").data[0], row=1, col=i+1)
fig.update_xaxes(showgrid=False, row=1, col=i+1)
fig.update_yaxes(showgrid=False, row=1, col=i+1)
fig.update_layout(title = f"{zone} : Plotting Heart Rate, Pace Diff, Time and Cardiac cost for {group} Group")
fig.show()
Maximum number of athletes are in this zone. Also we can see that the distribution for heart rate especially is a lot different from first two zones
zone = 'zone 3'
df_group = df[df.zone==zone].copy()
plot_groups(zone,df_group)
Like zone 3 a lot of individuals belong to zone 4. Cardiac cost distribution is pretty much the same as zone 3 whereas other distributions are different.
zone = 'zone 4'
df_group = df[df.zone==zone].copy()
plot_groups(zone,df_group)
Individuals belonging to Zone 5 demonstrate a high level of exertion and intensity in their performance. This zone represents individuals who are pushing themselves to their physical limits and may indicate a strong level of fitness or a competitive drive.
zone = 'zone 5'
df_group = df[df.zone==zone].copy()
plot_groups(zone,df_group)
To analyze the performance of athletes based on their completion time, we can divide the athletes into three groups: slow, medium, and fast. The division criteria are as follows:
This grouping allows us to compare the performance levels across different time intervals and gain insights into the distribution of athletes and their performance characteristics within each group.
df['finish_time_hrs'] = df['finish_time']/3600
SLOW,MED,FAST = 'slow','medium','fast'
# df['finish_time_hrs'] = pd.to_datetime(df['finish_time'], unit='m').dt.time
def div_perform(finish_time):
if finish_time < 3.5:
return FAST
elif 3.5 < finish_time <4.5:
return MED
else:
return SLOW
df['group'] = df.finish_time_hrs.map(lambda x : div_perform(x))
# get the value counts of the 'group' column
value_counts = df['group'].value_counts(dropna=False)
# create a pie chart using plotly express
fig = px.pie(names=value_counts.index, values=value_counts.values, hole=0.5)
# set the title and hover text
fig.update_layout(
title="Counts of Each Group",
hovermode="closest",
annotations=[dict(text=f"Athlete Groups",
x=0.5, y=0.5, font_size=20, showarrow=False)
for count in value_counts.values],
)
fig.update_traces(textinfo='value')
# show the plot
fig.show()
plotting Heart Rate, Pace diff, Avg. Time and Cardiac cost for each group and for each zone. Some of the zones are not shown in the plots below because no athelete in this group belong to that zone
group = 'slow'
df_group = df[df.group==group].copy()
plot_groups(group,df_group)
To analyze the distribution of different zones within each group based on a specific column (e.g., heart rate), we can follow the steps outlined below:
By following these steps, we can visualize the distribution of different zones within each group and understand the variation in the chosen column's values across the zones. This analysis provides insights into how performers in different zones differ from each other and can help identify patterns and trends related to the chosen column.
Below plots the distribution of cardiac cost for different zones within the slow group using a box plot. The zones are sorted and represented on the x-axis, while the average cardiac cost is shown on the y-axis.
df_slow = df[df.group=='slow']
df_slow = df_slow[df_slow.avg_heart_rate<181.813229] # remove this check later
df_slow.loc[df_slow.zone.isnull(),'zone'] = 'zone 1'
df_slow = df_slow.sort_values(by='zone')
fig = px.box(df_slow, x="zone", y="avg_cardiac_cost",points="all")
fig.update_layout(title = f"Cardiac Cost for different Zones for Slow Group")
fig.show()
A box plot to visualize the distribution of heart rate for different zones within the slow group. The zones are sorted and displayed on the x-axis, while the average heart rate is represented on the y-axis.
df_slow = df[df.group=='slow']
df_slow = df_slow[df_slow.avg_heart_rate<181.813229] # remove this check later
df_slow.loc[df_slow.zone.isnull(),'zone'] = 'zone 1'
df_slow = df_slow.sort_values(by='zone')
fig = px.box(df_slow, x="zone", y="avg_heart_rate",points="all")
fig.update_layout(title = f"Heart Rate for different Zones for Slow Group")
fig.show()
df_slow = df[df.group=='slow']
df_slow = df_slow[df_slow.avg_heart_rate<181.813229] # remove this check later
df_slow.loc[df_slow.zone.isnull(),'zone'] = 'zone 1'
df_slow = df_slow.sort_values(by='zone')
fig = px.box(df_slow, x="zone", y="avg_pace_diff",points="all")
fig.update_layout(title = f"Pace Difference for different Zones for Slow Group")
fig.show()
df_slow = df[df.group=='slow']
df_slow = df_slow[df_slow.avg_heart_rate<181.813229] # remove this check later
df_slow.loc[df_slow.zone.isnull(),'zone'] = 'zone 1'
df_slow = df_slow.sort_values(by='zone')
fig = px.box(df_slow, x="zone", y="avg_time",points="all")
fig.update_layout(title = f"Time for different Zones for Slow Group")
fig.show()
To visualize the cardiac cost for each zone within the slow group, we generate scatter plot. The zones are represented on the x-axis, the average cardiac cost is shown on the y-axis, and each data point is colored based on the respective zone. Additionally, the size of the data points corresponds to the finish time in hours. Hovering over the data points provides additional information such as the average cardiac cost.
fig = px.scatter(df_slow, x="zone", y="avg_cardiac_cost", color="zone",
size='finish_time_hrs', hover_data=['avg_cardiac_cost'])
fig.show()
To visualize the Time for each zone within the slow group, we generate scatter plot. The zones are represented on the x-axis, the average Time is shown on the y-axis, and each data point is colored based on the respective zone.
fig = px.scatter(df_slow, x="zone", y="avg_time", color="zone",
size='finish_time_hrs', hover_data=['avg_cardiac_cost'])
fig.show()
fig = px.scatter(df_slow, x="zone", y="avg_pace_diff", color="zone",
size='finish_time_hrs', hover_data=['avg_cardiac_cost'])
fig.show()
fig = px.scatter(df_slow, x="zone", y="avg_heart_rate", color="zone",
size='finish_time_hrs', hover_data=['avg_cardiac_cost'])
fig.show()
fig = px.scatter(df_slow, x="finish_time_hrs", y="avg_cardiac_cost", color="zone",
size='finish_time_hrs', hover_data=['avg_cardiac_cost'])
fig.show()
fig = px.scatter(df_slow, x="finish_time_hrs", y="avg_time", color="zone",
size='finish_time_hrs', hover_data=['avg_time'])
fig.show()
fig = px.scatter(df_slow, x="finish_time_hrs", y="avg_heart_rate", color="zone",
size='finish_time_hrs', hover_data=['avg_heart_rate'])
fig.show()
fig = px.scatter(df_slow, x="finish_time_hrs", y="avg_pace_diff", color="zone",
size='finish_time_hrs', hover_data=['avg_pace_diff'])
fig.show()
To analyze the heart rate, pace difference, time, and cardiac cost for each zone within the medium group, the code below utilizes subplots. The subplots are organized in a grid with 1 row and 4 columns, each representing a specific metric. Within each subplot, box plots are used to display the distribution of data points for the corresponding metric.
group = 'medium'
df_group = df[df.group==group].copy()
plot_groups(group,df_group)
To analyze the pace difference for each zone within the medium group, the code generates a box plot. The x-axis represents the different zones, while the y-axis represents the average pace difference.
### Plotting Pace Difference for all zones in Medium group
df_med = df[df.group=='medium']
df_med = df_med.sort_values(by='zone')
fig = px.box(df_med, x="zone", y="avg_pace_diff",points="all")
fig.update_layout(title = f"Pace Difference for different Zones for Medium Group")
fig.show()
df_med = df[df.group=='medium']
df_med = df_med.sort_values(by='zone')
fig = px.box(df_med, x="zone", y="avg_time",points="all")
fig.update_layout(title = f"Time for different Zones for Medium Group")
fig.show()
df_med = df[df.group=='medium']
df_med = df_med.sort_values(by='zone')
fig = px.box(df_med, x="zone", y="avg_heart_rate",points="all")
fig.update_layout(title = f"Heart Rate for different Zones for Medium Group")
fig.show()
df_med = df[df.group=='medium']
df_med = df_med.sort_values(by='zone')
fig = px.box(df_med, x="zone", y="avg_cardiac_cost",points="all")
fig.update_layout(title = f"Cardiac Cost for different Zones for Medium Group")
fig.show()
cols = ['avg_cardiac_cost','avg_heart_rate','avg_pace_diff','avg_time']
for col in cols:
fig = px.scatter(df_med, x="zone", y=col, color="zone",
size='finish_time_hrs', hover_data=[col])
fig.update_layout(title = f"{col} for different Zones for Medium Group")
fig.show()
cols = ['avg_cardiac_cost','avg_heart_rate','avg_pace_diff','avg_time']
for col in cols:
fig = px.scatter(df_med, x="finish_time_hrs", y=col, color="zone",
size='finish_time_hrs', hover_data=[col])
fig.update_layout(title = f"{col} vs Finish Time(hrs) for different Zones in Medium Group")
fig.show()
The following code segment plots the cardiac cost, time, pace difference, and heart rate for the fast group across different zones. The purpose is to analyze the performance characteristics within each zone of the fast group. The resulting plot displays box plots for each metric, with the x-axis representing the zones and the y-axis representing the corresponding metric values.
df_fast = df[df.group=='fast']
df_fast = df_fast.sort_values(by='zone')
fig = px.box(df_fast, x="zone", y="avg_time",points="all")
fig.update_layout(title = f"Time for different Zones for fast Group")
fig.show()
df_fast = df[df.group=='fast']
df_fast = df_fast.sort_values(by='zone')
fig = px.box(df_fast, x="zone", y="avg_cardiac_cost",points="all")
fig.update_layout(title = f"Cardiac Cost for different Zones for fast Group")
fig.show()
df_fast = df[df.group=='fast']
df_fast = df_fast.sort_values(by='zone')
fig = px.box(df_fast, x="zone", y="avg_heart_rate",points="all")
fig.update_layout(title = f"Heart Rate for different Zones for fast Group")
fig.show()
df_fast = df[df.group=='fast']
df_fast = df_fast.sort_values(by='zone')
fig = px.box(df_fast, x="zone", y="avg_pace_diff",points="all")
fig.update_layout(title = f"Pace Difference for different Zones for fast Group")
fig.show()
The following code segment plots scatter plots for the cardiac cost, heart rate, pace difference, and time for the fast group across different zones. The purpose is to visualize the distribution and relationship of each metric within each zone of the fast group. Each scatter plot has the zone on the x-axis, the corresponding metric on the y-axis, and different zones represented by different colors.
cols = ['avg_cardiac_cost','avg_heart_rate','avg_pace_diff','avg_time']
for col in cols:
fig = px.scatter(df_fast, x="zone", y=col, color="zone",
size='finish_time_hrs', hover_data=[col])
fig.update_layout(title = f"{col} for different Zones for Fast Group")
fig.show()
cols = ['avg_cardiac_cost','avg_heart_rate','avg_pace_diff','avg_time']
for col in cols:
fig = px.scatter(df_fast, x="finish_time_hrs", y=col, color="zone",
size='finish_time_hrs', hover_data=[col])
fig.update_layout(title = f"{col} vs Finish Time(hrs) for different Zones in Fast Group")
fig.show()
group = 'fast'
df_group = df[df.group==group].copy()
plot_groups(group,df_group)
The code segment calculates the average values of cardiac cost, pace difference, and heart rate for individuals in each group and zone. It is a table that shows aggregated information for zones and metrics
df_grouped = df[['group','zone','avg_cardiac_cost',
'avg_pace_diff','avg_heart_rate']].groupby(['group','zone']).mean().copy()
df_grouped
| avg_cardiac_cost | avg_pace_diff | avg_heart_rate | ||
|---|---|---|---|---|
| group | zone | |||
| fast | zone 1 | 3.666077 | 4.177152 | 114.064704 |
| zone 2 | 6.422354 | 4.344681 | 133.020681 | |
| zone 3 | 6.060830 | 4.639915 | 153.138853 | |
| zone 4 | 8.383007 | 4.590122 | 168.096919 | |
| zone 5 | 7.225559 | 4.406520 | 183.827969 | |
| medium | zone 1 | 12.267995 | 7.153221 | 114.907737 |
| zone 2 | 8.663070 | 5.785085 | 133.908162 | |
| zone 3 | 8.005074 | 5.634214 | 153.276322 | |
| zone 4 | 7.240500 | 5.599993 | 167.687618 | |
| zone 5 | 5.696055 | 5.558052 | 183.376041 | |
| slow | zone 1 | 8.667584 | 7.709010 | 113.950045 |
| zone 2 | 8.799938 | 7.162069 | 133.510517 | |
| zone 3 | 8.280546 | 7.149391 | 151.948541 | |
| zone 4 | 9.003134 | 6.858199 | 167.265080 | |
| zone 5 | 9.772631 | 7.252094 | 183.041151 |
We are also using a 3D scatter plot that visualizes the relationship between average cardiac cost, average pace difference, and average heart rate for different athlete groups. The aim is to explore the patterns and differences between these three variables across the groups.
fig = go.Figure()
for group in df_grouped.index.get_level_values('group').unique():
group_data = df_grouped.loc[group]
fig.add_trace(go.Scatter3d(
x=group_data['avg_cardiac_cost'],
y=group_data['avg_pace_diff'],
z=group_data['avg_heart_rate'],
mode='markers',
name=group,
text=group_data.index.get_level_values('zone'),
marker=dict(
size=10,
opacity=0.8
)
))
fig.update_layout(scene=dict(
xaxis_title='Avg Cardiac Cost',
yaxis_title='Avg Pace Diff',
zaxis_title='Avg Heart Rate'))
fig.show()
The analysis is based on four key metrics - cardiac cost, pace difference, heart rate, and time spent - and considers all athletes belonging to the slow group, comparing their performance across different zones.
The visualization provides valuable insights into the performance of athletes belonging to the "slow" group. The results show that the athletes in this group spent the majority of their time in zone 2 and zone 3. Additionally, when the heart rate values of athletes in the slow group are combined, the weighted average is considerably lower than that of other groups. These findings suggest that athletes in the slow group may benefit from adjusting their training to target higher heart rate zones, which could potentially lead to improved performance. Overall, this visualization provides valuable insights into the performance of athletes in the slow group and can be used to inform future training strategies for this group.
def plot_zone_cols(group):
df_ = df[df.group==group].copy()
for col in ['avg_heart_rate','avg_cardiac_cost','avg_time','avg_pace_diff']:
avg_col_df_raw = df_[['zone',col]].groupby(['zone']).mean()
avg_col_df_raw = avg_col_df_raw.reset_index()
num_performers = df_.hashedathleteid.nunique()
avg_col_df = df_[['zone',col]].groupby(['zone']).sum()/num_performers
avg_col_df = avg_col_df.rename(columns={col:f'{col} - (combined)'})
avg_col_df = avg_col_df.reset_index().merge(avg_col_df_raw)
avg_col_df = pd.concat([avg_col_df,
pd.DataFrame([['zone 1',1e-7,1e-7]],columns=['zone',f'{col} - (combined)',col])])
avg_col_df = avg_col_df.sort_values(by='zone').reset_index(drop=True)
display(avg_col_df)
# Create a histogram with heart rate columns side by side
fig = px.histogram(avg_col_df, x="zone", y=[f'{col} - (combined)', col],
barmode="group",
labels={"value": col},
title=f"Distribution of {col} by Zone",
color_discrete_sequence=["#636EFA", "#EF553B"])
# Adjust the bar gaps
fig.update_layout(bargap=0.1, bargroupgap=0.2)
# Show the plot
fig.show()
group = 'slow'
plot_zone_cols(group)
| zone | avg_heart_rate - (combined) | avg_heart_rate | |
|---|---|---|---|
| 0 | zone 1 | 1.068282e+00 | 1.139500e+02 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 1.460271e+01 | 1.335105e+02 |
| 3 | zone 3 | 6.172909e+01 | 1.519485e+02 |
| 4 | zone 4 | 7.422388e+01 | 1.672651e+02 |
| 5 | zone 5 | 2.288014e+00 | 1.830412e+02 |
| zone | avg_cardiac_cost - (combined) | avg_cardiac_cost | |
|---|---|---|---|
| 0 | zone 1 | 8.125860e-02 | 8.667584e+00 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 9.624933e-01 | 8.799938e+00 |
| 3 | zone 3 | 3.363972e+00 | 8.280546e+00 |
| 4 | zone 4 | 3.995141e+00 | 9.003134e+00 |
| 5 | zone 5 | 1.221579e-01 | 9.772631e+00 |
| zone | avg_time - (combined) | avg_time | |
|---|---|---|---|
| 0 | zone 1 | 8.851202e+01 | 9.441282e+03 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 9.758279e+02 | 8.921855e+03 |
| 3 | zone 3 | 3.553194e+03 | 8.746323e+03 |
| 4 | zone 4 | 3.744831e+03 | 8.439057e+03 |
| 5 | zone 5 | 1.119820e+02 | 8.958560e+03 |
| zone | avg_pace_diff - (combined) | avg_pace_diff | |
|---|---|---|---|
| 0 | zone 1 | 7.227197e-02 | 7.709010e+00 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 7.833513e-01 | 7.162069e+00 |
| 3 | zone 3 | 2.904440e+00 | 7.149391e+00 |
| 4 | zone 4 | 3.043326e+00 | 6.858199e+00 |
| 5 | zone 5 | 9.065117e-02 | 7.252094e+00 |
The analysis focuses on four essential metrics: cardiac cost, pace difference, heart rate, and time spent. It examines the performance of all athletes in the medium group and compares it across various zones. By doing so, it aims to identify patterns and trends that could help optimize training and competition strategies.
The visualization presented below offers significant insights into the performance of athletes categorized as "medium." The findings reveal that these athletes primarily spent their time in zone 2 and zone 3. Furthermore, when the heart rate data of athletes in the "medium" group are aggregated, the weighted average is considerably higher than that of the "slow" group, indicating that the former group puts in more effort and has significantly higher zone 2 and zone 3 values. Additionally, their time spent is lower than that of the "slow" group, which enables them to finish more quickly.
group = 'medium'
plot_zone_cols(group)
| zone | avg_heart_rate - (combined) | avg_heart_rate | |
|---|---|---|---|
| 0 | zone 1 | 5.229021e-01 | 1.149077e+02 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 3.656196e+00 | 1.339082e+02 |
| 3 | zone 3 | 6.207778e+01 | 1.532763e+02 |
| 4 | zone 4 | 8.928078e+01 | 1.676876e+02 |
| 5 | zone 5 | 4.798235e+00 | 1.833760e+02 |
| zone | avg_cardiac_cost - (combined) | avg_cardiac_cost | |
|---|---|---|---|
| 0 | zone 1 | 5.582705e-02 | 1.226799e+01 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 2.365343e-01 | 8.663070e+00 |
| 3 | zone 3 | 3.242100e+00 | 8.005074e+00 |
| 4 | zone 4 | 3.855010e+00 | 7.240500e+00 |
| 5 | zone 5 | 1.490435e-01 | 5.696055e+00 |
| zone | avg_time - (combined) | avg_time | |
|---|---|---|---|
| 0 | zone 1 | 3.244101e+01 | 7.128913e+03 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 1.909475e+02 | 6.993454e+03 |
| 3 | zone 3 | 2.804767e+03 | 6.925254e+03 |
| 4 | zone 4 | 3.690299e+03 | 6.931139e+03 |
| 5 | zone 5 | 1.808072e+02 | 6.909979e+03 |
| zone | avg_pace_diff - (combined) | avg_pace_diff | |
|---|---|---|---|
| 0 | zone 1 | 3.255163e-02 | 7.153221e+00 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 1.579545e-01 | 5.785085e+00 |
| 3 | zone 3 | 2.281889e+00 | 5.634214e+00 |
| 4 | zone 4 | 2.981566e+00 | 5.599993e+00 |
| 5 | zone 5 | 1.454325e-01 | 5.558052e+00 |
The analysis is based on four key metrics - cardiac cost, pace difference, heart rate, and time spent - and considers all athletes belonging to the slow group, comparing their performance across different zones. Similarly for Fast group, the data reveals that this group spends a considerable amount of time in zone 3, indicating that they maintain a steady pace throughout the duration of the marathon. This steady pace and lack of frequent fluctuation could be a contributing factor to their superior performance compared to the other two groups.Additionally, the Fast group prefers to spend most of their time in zone 4 in terms of cardiac cost, which suggests that they are able to sustain a high level of effort for longer periods of time. These findings provide valuable insights into the training and competition strategies that may be effective for athletes looking to improve their performance.
group = 'fast'
plot_zone_cols(group)
| zone | avg_heart_rate - (combined) | avg_heart_rate | |
|---|---|---|---|
| 0 | zone 1 | 7.129044e-01 | 1.140647e+02 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 4.988276e+00 | 1.330207e+02 |
| 3 | zone 3 | 5.710803e+01 | 1.531389e+02 |
| 4 | zone 4 | 9.385411e+01 | 1.680969e+02 |
| 5 | zone 5 | 3.829749e+00 | 1.838280e+02 |
| zone | avg_cardiac_cost - (combined) | avg_cardiac_cost | |
|---|---|---|---|
| 0 | zone 1 | 2.291298e-02 | 3.666077e+00 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 2.408383e-01 | 6.422354e+00 |
| 3 | zone 3 | 2.260184e+00 | 6.060830e+00 |
| 4 | zone 4 | 4.680512e+00 | 8.383007e+00 |
| 5 | zone 5 | 1.505325e-01 | 7.225559e+00 |
| zone | avg_time - (combined) | avg_time | |
|---|---|---|---|
| 0 | zone 1 | 3.254211e+01 | 5.206738e+03 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 1.845386e+02 | 4.921030e+03 |
| 3 | zone 3 | 2.088584e+03 | 5.600671e+03 |
| 4 | zone 4 | 3.090927e+03 | 5.535988e+03 |
| 5 | zone 5 | 1.021278e+02 | 4.902133e+03 |
| zone | avg_pace_diff - (combined) | avg_pace_diff | |
|---|---|---|---|
| 0 | zone 1 | 2.610720e-02 | 4.177152e+00 |
| 1 | zone 1 | 1.000000e-07 | 1.000000e-07 |
| 2 | zone 2 | 1.629255e-01 | 4.344681e+00 |
| 3 | zone 3 | 1.730302e+00 | 4.639915e+00 |
| 4 | zone 4 | 2.562818e+00 | 4.590122e+00 |
| 5 | zone 5 | 9.180250e-02 | 4.406520e+00 |
To explore the relationship between the average pace difference and average heart rate for the 'slow' group, we utilize a custom function called plot_relation. This function generates two insightful plots:
Scatter Plot: The scatter plot displays individual data points of average pace difference versus average heart rate for the 'slow' group. Additionally, a trendline is fitted using ordinary least squares regression to identify any potential trends or patterns.
Correlation Heatmap: The heatmap represents the correlation matrix between average pace difference and average heart rate. The color intensity reflects the strength and direction of the correlation, with a colorbar providing a clear visual guide. Annotations within the heatmap display the precise correlation values.
After defining the plot_relation function, we create a subset of the main DataFrame, df, by filtering for the 'slow' group. This subset, named df_slow, is then passed as input to the plot_relation function, resulting in the generation of the aforementioned plots specific to the 'slow' group.
These visualizations provide valuable insights into the relationship and correlation between average pace difference and average heart rate for the 'slow' group, enabling further analysis and interpretation.
def plot_relation(df):
fig = px.scatter(df, x='avg_pace_diff', y='avg_heart_rate', trendline='ols')
fig.update_layout()
fig.update_layout(
title='Relation between Pace Diff and Heart Rate'
)
fig.show()
cols = ['avg_heart_rate','avg_pace_diff']
corr = df[cols].corr()
fig = go.Figure(data=go.Heatmap(
z=corr.values,
x=corr.columns,
y=corr.index,
colorscale='RdBu',
zmin=-1,
zmax=1,
colorbar=dict(title='Correlation'),
text=corr.round(2),
hovertemplate='Correlation: %{text:.2f}<extra></extra>'
))
annotations = []
for i, row in enumerate(corr.index):
for j, col in enumerate(corr.columns):
annotation = dict(
x=col,
y=row,
text=f'{corr.iloc[i, j]:.2f}',
font=dict(color='black'),
showarrow=False
)
annotations.append(annotation)
fig.update_layout(
title='Correlation Heatmap b/w Pace Diff and Heart Rate',
annotations=annotations
)
fig.show()
df_slow = df[df.group=='slow'].copy()
plot_relation(df_slow)
df_med = df[df.group=='medium'].copy()
plot_relation(df_med)
df_fast = df[df.group=='fast'].copy()
plot_relation(df_fast)
XGBoost (eXtreme Gradient Boosting) is a gradient boosting algorithm that is widely used in machine learning for supervised learning problems, especially in structured data. It has great computational speed and efficiency, so we expect higher accuracy with the XGBoost machine learning model compared to other models (such as the CBR models of Feely et al., 2020).
The algorithm works by iteratively building decision trees that correct the errors made by the previous trees. In each iteration, the algorithm focuses on the examples that were poorly predicted in the previous iteration and builds a new tree to improve the prediction for those examples. This process is repeated until the predictions are good enough or until a predefined number of iterations (trees) is reached.
The first two terms of the equation represent a linear regression model, while the last term represents the non-linear component introduced by the decision tree-based weak learners.
We have experimented with different hyperparameters for the XGBoost algorithm while using cardiac cost and heart rate to predict the finish time. The following hyperparameters were considered:
Heart rate is an important factor in predicting the marathon performance of an athlete. We have heart rate values for each individual throughout the marathon, which can be used to predict their competition time. Heart rate variability, which is the difference between the maximum and minimum heart rate values, is also an important metric for analyzing marathon performance. A high heart rate variability indicates that the athlete's heart rate is fluctuating significantly, which may impact their performance. Therefore, it is important for the network to extract these features from the heart rate data in order to better predict the marathon performance of the athletes.
def hr_group_pred(df):
col = 'heartrate_100'
df[col] = df[col].apply(lambda x: x[:len(x)//2])
df_col = pd.DataFrame(df[col].tolist())
# there are a couple of noise values in df so just filling them with mean
df_col = df_col.fillna(df_col.mean())
print("Extracted heart rate column and created a new dataframe by exploding the previous df")
display(df_col.head(2))
num_cols = df_col.shape[1]
print("dividing data into test train split")
X_train, X_test, y_train, y_test = train_test_split(df_col, df['finish_time_hrs'], test_size=0.4,shuffle = False)
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test).flatten()
mse = np.mean((y_pred - y_test)**2)
print(f"Mean Squared Error: {mse}")
# create a scatter plot with your data and predictions
# add a line plot for your xgboost predictions
# fig.add_trace(go.Scatter(x=df['x'], y=df['preds'], mode='lines', name='XGBoost Predictions'))
df_ = pd.DataFrame()
df_['avg_heart_rate'] = df.iloc[y_test.index[0]:]['avg_heart_rate']
df_['y_test'] = y_test
df_['y_pred'] = y_pred
df_ = df_.sort_values(by='avg_heart_rate')
#create scatterplot with regression line
sns.regplot(data = df_, x='avg_heart_rate', y = 'y_pred')
# returing test set and predictions
return y_test,y_pred,mse
Cardiac Cost is an important factor in predicting the marathon performance of an athlete. We have heart rate values for each individual throughout the marathon, which can be used to calculate the cardiac cost to predict their competition time.
def cc_group_pred(df):
col = 'cardiac_cost_100'
df[col] = df[col].apply(lambda x: x[:len(x)//2])
df_col = pd.DataFrame(df[col].tolist())
# there are a couple of noise values in df so just filling them with mean
df_col = df_col.fillna(df_col.mean())
print("Extracted Cardiac Cost column and created a new dataframe by exploding the previous df")
display(df_col.head(2))
num_cols = df_col.shape[1]
print("dividing data into test train split")
X_train, X_test, y_train, y_test = train_test_split(df_col, df['finish_time_hrs'], test_size=0.3,shuffle = False)
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test).flatten()
mse = np.mean((y_pred - y_test)**2)
print(f"Mean Squared Error: {mse}")
df_ = pd.DataFrame()
df_['avg_cardiac_cost'] = df.iloc[y_test.index[0]:]['avg_cardiac_cost']
df_['y_test'] = y_test
df_['y_pred'] = y_pred
df_ = df_.sort_values(by='avg_cardiac_cost')
#create scatterplot with regression line
sns.regplot(data = df_, x='avg_cardiac_cost', y = 'y_pred')
# returing test set and predictions
return y_test,y_pred,mse
The comparison of cardiac cost regression lines across different groups provides valuable insights into the relationship between cardiac cost and finish time. The horizontal regression line for the slow group indicates that there is no significant correlation between cardiac cost and finish time for this group. In contrast, the positive slope of the cardiac cost regression line for the medium group suggests that as the cardiac cost increases, the finish time also tends to increase, at a slower rate. The negative slope of the cardiac cost regression line for the fast group, on the other hand, indicates that as the cardiac cost increases, the finish time tends to decrease, suggesting that the fast group is more efficient in utilizing their cardiac cost to achieve faster finish times. Overall, these findings highlight the importance of monitoring cardiac cost during the marathon, as it can be a useful predictor of an athlete's performance, especially for the medium and fast groups.
In our study, we have used heart rate values throughout the marathon to predict finish time and plotted three regression lines for slow, medium, and fast groups of athletes. Interestingly, the regression lines for heart rate show a different trend to those of cardiac cost. The slow group has a negative slope regression line, while the medium group has a slightly positive slope and the fast group has a negative slope. This indicates that the fast group can maintain a lower heart rate while running at a faster pace than the slow and medium groups, which demonstrates their better cardiovascular fitness and ability to sustain high fractional use of heart rate. The results of our regression analysis can be used in polarised training and help athletes optimize their performance in future marathons.
df_results = pd.DataFrame([],columns = ["group","hr_error",'cc_error'])
df_slow = df_slow.reset_index(drop=True)
y_test,y_pred,cc_mse = cc_group_pred(df_slow)
df_slow_cc_results = df_slow.iloc[y_test.index[0]:].copy().reset_index(drop=True)
df_slow_cc_results['predicted_finish_time'] = y_pred
Extracted Cardiac Cost column and created a new dataframe by exploding the previous df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1866.666667 | 0.003253 | 0.002552 | 0.002704 | 0.003322 | 0.003421 | 0.003855 | 0.003787 | 0.002956 | 0.003177 | ... | 0.003781 | 0.003363 | 0.00335 | 0.003219 | 0.003505 | 0.003216 | 0.003409 | 0.003712 | 0.003037 | 0.003333 |
| 1 | 2400.000000 | 0.002694 | 0.003253 | 0.003216 | 0.003395 | 0.003579 | 0.003261 | 0.003747 | 0.002955 | 0.003751 | ... | 0.003781 | 0.003363 | 0.00335 | 0.003219 | 0.003505 | 0.003216 | 0.003409 | 0.003712 | 0.003037 | 0.003333 |
2 rows × 290 columns
dividing data into test train split Mean Squared Error: 0.18433761434206034
df_slow = df_slow.reset_index(drop=True)
y_test,y_pred,hr_mse = hr_group_pred(df_slow)
df_slow_hr_results = df_slow.iloc[y_test.index[0]:].copy().reset_index(drop=True)
df_slow_hr_results['predicted_finish_time'] = y_pred
df_slow_hr_results = df_slow_hr_results.sort_values(by='finish_time_hrs')
new_df = pd.DataFrame([["slow",hr_mse,cc_mse]],columns= ["group","hr_error",'cc_error'])
df_results = pd.concat([df_results, new_df], axis=0, ignore_index=True)
Extracted heart rate column and created a new dataframe by exploding the previous df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 112.0 | 149.3608 | 132.41661 | 143.04920 | 140.47089 | 140.53351 | 147.19073 | 145.00604 | 141.14948 | 138.20952 | ... | 126.98686 | 126.0 | 124.99988 | 125.0 | 125.0 | 125.0 | 125.0 | 125.0 | 125.025215 | 124.0 |
| 1 | 144.0 | 143.9577 | 142.98877 | 145.28488 | 143.91667 | 145.91734 | 133.09620 | 147.96850 | 147.93945 | 149.23276 | ... | 126.98686 | 126.0 | 124.99988 | 125.0 | 125.0 | 125.0 | 125.0 | 125.0 | 125.025215 | 124.0 |
2 rows × 290 columns
dividing data into test train split Mean Squared Error: 0.27460432596066603
# df_slow_hr_results[['finish_time_hrs','predicted_finish_time']]
def comparison_hr_cc(df_hr_results,df_cc_results):
df_hr_results = df_hr_results.rename(columns={'predicted_finish_time':'hr_predicted_finish_time'})
df_cc_results = df_cc_results.rename(columns={'predicted_finish_time':'cc_predicted_finish_time'})
hr_cols = ['hashedid','finish_time_hrs','hr_predicted_finish_time']
cc_cols = ['hashedid','cc_predicted_finish_time']
df_res = df_cc_results[cc_cols].merge(df_hr_results[hr_cols],how='inner')
df_res_columns = ['cc_predicted_finish_time','hr_predicted_finish_time']
fig = go.Figure()
# plotting the predictions
for i, algo in enumerate(df_res_columns):
fig.add_trace(
go.Scatter(
x=df_res['finish_time_hrs'],
y=df_res[algo],
mode='markers',
name=algo
)
)
fig.update_layout(
title='Comparison of Finish Time predictions using Cardiac Cost, Pace and Heart Rate',
xaxis_title='True Finish Time values',
yaxis_title='Predicted Finish Time values'
)
fig.show()
comparison_hr_cc(df_slow_hr_results,df_slow_cc_results)
df_med = df_med.reset_index(drop=True)
y_test,y_pred,cc_mse = cc_group_pred(df_med)
df_med_cc_results = df_med.iloc[y_test.index[0]:].copy().reset_index()
df_med_cc_results['predicted_finish_time'] = y_pred
Extracted Cardiac Cost column and created a new dataframe by exploding the previous df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2066.666667 | 0.003442 | 0.004382 | 0.005297 | 0.006202 | 0.005587 | 0.006239 | 0.005129 | 0.004978 | 0.007614 | ... | 0.005442 | 0.005516 | 0.005342 | 0.005300 | 0.005236 | 8.160218 | 0.005133 | 0.004766 | 0.005501 | 0.00487 |
| 1 | 1150.000000 | 0.002916 | 0.003388 | 0.004716 | 0.004223 | 0.003924 | 0.003875 | 0.004336 | 0.003435 | 0.004050 | ... | 0.004432 | 0.004754 | 0.004756 | 0.004668 | 0.004693 | 0.004732 | 0.005133 | 0.004766 | 0.005501 | 0.00487 |
2 rows × 216 columns
dividing data into test train split Mean Squared Error: 0.05121551416699187
y_test,y_pred,hr_mse = hr_group_pred(df_med)
df_med_hr_results = df_med.iloc[y_test.index[0]:].copy().reset_index()
df_med_hr_results['predicted_finish_time'] = y_pred
df_med_hr_results = df_med_hr_results.sort_values(by='finish_time_hrs')
# fig = px.line(df_med_hr_results, x="finish_time_hrs", y="predicted_finish_time", title='finish Time MSE')
# fig.show()
new_df = pd.DataFrame([["medium",hr_mse,cc_mse]],columns= ["group","hr_error",'cc_error'])
df_results = pd.concat([df_results, new_df], axis=0, ignore_index=True)
Extracted heart rate column and created a new dataframe by exploding the previous df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 124.0 | 138.00153 | 158.49796 | 172.15256 | 183.96542 | 176.97311 | 183.916340 | 182.61981 | 177.90056 | 180.461300 | ... | 167.99971 | 166.8640 | 170.06549 | 165.99829 | 164.90790 | 165.38415 | 166.331248 | 165.839582 | 167.466664 | 162.196475 |
| 1 | 69.0 | 118.00433 | 115.79638 | 117.72328 | 118.03809 | 117.01072 | 118.757034 | 124.00983 | 122.18063 | 123.031784 | ... | 153.74084 | 149.6435 | 153.02608 | 151.64230 | 154.37788 | 155.02983 | 166.331248 | 165.839582 | 167.466664 | 162.196475 |
2 rows × 216 columns
dividing data into test train split Mean Squared Error: 0.07764102109278646
comparison_hr_cc(df_med_hr_results,df_med_cc_results)
# df_med_cc_results
df_fast = df_fast.reset_index(drop=True)
y_test,y_pred,cc_mse = cc_group_pred(df_fast)
df_fast_cc_results = df_fast.iloc[y_test.index[0]:].copy().reset_index()
df_fast_cc_results['predicted_finish_time'] = y_pred
Extracted Cardiac Cost column and created a new dataframe by exploding the previous df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1800.000000 | 0.003761 | 0.004617 | 0.005372 | 0.005222 | 0.005516 | 0.004923 | 0.00521 | 0.005423 | 0.005762 | ... | 0.006561 | 0.006538 | 0.006529 | 0.006223 | 0.008609 | 0.005853 | 0.00625 | 0.005877 | 0.006591 | 0.005901 |
| 1 | 1183.333333 | 0.005611 | 0.006052 | 0.007811 | 0.006651 | 0.005979 | 0.006467 | 0.00705 | 0.006349 | 0.005976 | ... | 0.006428 | 0.006442 | 0.006320 | 0.006223 | 0.008609 | 0.005853 | 0.00625 | 0.005877 | 0.006591 | 0.005901 |
2 rows × 219 columns
dividing data into test train split Mean Squared Error: 0.055999146859345976
y_test,y_pred,hr_mse = hr_group_pred(df_fast)
df_fast_hr_results = df_fast.iloc[y_test.index[0]:].copy().reset_index()
df_fast_hr_results['predicted_finish_time'] = y_pred
df_fast_hr_results = df_fast_hr_results.sort_values(by='finish_time_hrs')
# fig = px.line(df_med_hr_results, x="finish_time_hrs", y="predicted_finish_time", title='finish Time MSE')
# fig.show()
new_df = pd.DataFrame([["fast",hr_mse,cc_mse]],columns= ["group","hr_error",'cc_error'])
df_results = pd.concat([df_results, new_df], axis=0, ignore_index=True)
Extracted heart rate column and created a new dataframe by exploding the previous df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 108.0 | 129.89734 | 134.17978 | 135.58492 | 141.13550 | 142.01509 | 146.00969 | 143.95483 | 145.96210 | 147.24023 | ... | 164.35478 | 165.96284 | 167.00540 | 164.874502 | 170.30544 | 156.95015 | 160.00215 | 156.04495 | 157.41348 | 155.5053 |
| 1 | 71.0 | 118.21266 | 133.98340 | 141.48727 | 138.82515 | 141.01483 | 140.07600 | 141.81734 | 144.17122 | 142.58566 | ... | 155.17902 | 153.91727 | 154.00504 | 164.874502 | 170.30544 | 156.95015 | 160.00215 | 156.04495 | 157.41348 | 155.5053 |
2 rows × 219 columns
dividing data into test train split Mean Squared Error: 0.07423839817618307
df_results
| group | hr_error | cc_error | |
|---|---|---|---|
| 0 | slow | 0.274604 | 0.184338 |
| 1 | medium | 0.077641 | 0.051216 |
| 2 | fast | 0.074238 | 0.055999 |
comparison_hr_cc(df_fast_hr_results,df_fast_cc_results)
# df_med_cc_results
The Below plot shows the Mean squared error comparison for different groups using both cardiac cost and heart rate. One possible explanation for the superior performance of cardiac cost could be that it is a combination of both heart rate and pace difference. Pace difference is a crucial variable in marathon performance as it reflects the ability of an athlete to maintain a consistent pace throughout the race. Cardiac cost, which has a unit corresponding to the amount of heartbeat per meter run, provides a more accurate measure of the cost of maintaining a certain pace. Therefore, cardiac cost captures both the physiological and performance aspects of marathon running, which might explain its superior performance compared to heart rate alone.
We also found that the best results were obtained for the medium group using cardiac cost. This finding is consistent with previous explanations, which has shown that the performance of the medium group is the most sensitive to cardiac cost. Also, our study highlights the importance of heart rate and cardiac cost in predicting marathon performance. By using time series data and regression analysis, we have shown that cardiac cost is a more effective predictor of marathon finish time than heart rate alone. The results of our study could have important implications for the training and preparation of marathon runners.
fig = px.histogram(df_results, x='group', y=['hr_error', 'cc_error'],
barmode='group', title='Histogram of Errors by Group')
fig.update_yaxes(title='MSE error')
fig.show()